#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import sys, argparse
import csv
import time as t
import datetime as d
import numpy as np
import pandas as pd
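'''
Address the class-imbalance problem by randomly sampling the positive examples.
(The positive records are currently split in file order; see the shuffle TODO below.)
'''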

def parse_args():
    """Parse the command-line arguments of this script.

    Positional arguments (all returned as the raw strings that were typed):
        train_feature: path of the CSV file holding the training features.
        n_parts_train: number of parts the positive records are split into.
        prefix: path prefix of the output files.

    When the script is started without any argument, the usage help is
    printed and the process exits, exactly as if ``-h`` had been given.

    Returns:
        dict mapping each argument name to its command-line value.
    """
    parser = argparse.ArgumentParser(
        description='Split the positive records of a training set into '
                    'parts and write each part together with all negative '
                    'records.')
    parser.add_argument(
        'train_feature',
        help='CSV file with the training features and a "label" column')
    parser.add_argument(
        'n_parts_train',
        help='number of parts to split the positive records into')
    parser.add_argument(
        'prefix',
        help='output path prefix; part i is written to <prefix>.<i>')

    # Fall back to "-h" on a local list instead of appending to sys.argv, so
    # the process-wide argument list is never modified.
    argv = sys.argv[1:] or ['-h']
    return vars(parser.parse_args(argv))

# In the data, pos/neg = 4:1.

# TODO: shuffle the original train set before splitting.

# The input is the complete feature file.
args = parse_args()
train_feature = args['train_feature']
n_parts_train = int(args['n_parts_train'])
prefix = args['prefix']

# Load the whole training set; rows are selected by their `label` column
# (0 = negative record, 1 = positive record).
train = pd.read_csv(train_feature)

neg_rcds = train[train.label == 0]
pos_rcds = train[train.label == 1]

n_pos, n_neg = pos_rcds.shape[0], neg_rcds.shape[0]
# The ratio is only a diagnostic: without negative rows report "inf" instead
# of aborting with ZeroDivisionError before any part has been written.
ratio = n_pos / n_neg if n_neg else float('inf')
# One pre-formatted string prints the same text under the Python 2 print
# statement and the Python 3 print function.
print('pos/neg:\t %s' % ratio)

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
# The resulting chunks (sizes and row order) are the same.
pos_parts = [pos_rcds.iloc[positions]
             for positions in np.array_split(np.arange(n_pos), n_parts_train)]

# Append all negative records to each positive part and write the result to
# "<prefix>.<part number>".
for no, part in enumerate(pos_parts):
    path = "%s.%d" % (prefix, no)
    data = pd.concat([part, neg_rcds], axis=0)
    data.to_csv(path, index=False)
